{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0bb1cbb2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas import Series, DataFrame\n",
    "import time"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "33611a51",
   "metadata": {},
   "source": [
    "# Beyond 1\n",
    "\n",
    "If we read the CSV file using the \"pyarrow\" engine, do we see any speedup? That is, can we read CSV files into memory any faster if we use a different engine?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b95a0fc2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reading via pyarrow engine, total_time=9.923564148019068\n"
     ]
    }
   ],
   "source": [
    "filename = '../data/nyc-parking-violations-2020.csv'\n",
    "start_time = time.perf_counter()\n",
    "df = pd.read_csv(filename, engine='pyarrow')\n",
    "end_time = time.perf_counter()\n",
    "total_time = end_time - start_time\n",
    "print(f'Reading via pyarrow engine, {total_time=}')    "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "54c0002c",
   "metadata": {},
   "source": [
    "# Beyond 2\n",
    "\n",
    "If we specify the dtypes when reading from a CSV file, do we save any time?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "87edccd4",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "63.521172957960516"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "start_time = time.perf_counter()\n",
    "df = pd.read_csv(filename, low_memory=False,\n",
    "                 dtype=dict(df.dtypes))\n",
    "end_time = time.perf_counter()\n",
    "\n",
    "total_time = end_time - start_time\n",
    "total_time"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "261622c5",
   "metadata": {},
   "source": [
    "# Beyond 3\n",
    "\n",
    "How much memory does our data frame take in as a `pandas` data frame? How much memory does it require as an Arrow table?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "14842cf1",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "16,789,335,057\n"
     ]
    }
   ],
   "source": [
    "# Pandas table\n",
    "n = df.memory_usage(deep=True).sum()\n",
    "print(f'{n:,}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "947e968a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Arrow table\n",
    "import pyarrow.feather as feather\n",
    "read_arrow = feather.read_table('parking-violations.feather')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a0d4be71",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4,309,680,899\n"
     ]
    }
   ],
   "source": [
    "n = read_arrow.nbytes\n",
    "print(f'{n:,}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5d110443-4287-47dc-b3ef-117930957cbf",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
